Import data in list via function Calll to import_csv_data().
data <- import_csv_data(path = "Olist_data/")
data_marketing <- import_csv_data(path = "Olist_Marketing_data/")
Source https://www.kaggle.com/jungjoonlee/eda-with-ecommerce-marketplace-seller-side
Marketing Qualified Lead (MQL) means a potential reseller/manufacturer who has an interest in selling their products on Olist. After a MQL filled a form on landing page to sign up for seller, a Sales Development Representative(SDR) contacted the MQL and gathered more information about the lead. Then a Sales Representative(SR) consulted the MQL. So interaction between SDRs/SRs and MQLs can affect conversion from MQLs to sellers. A MQL who finally signed up for seller is called a closed deal.
skim(data_marketing$olist_closed_deals_dataset)
| Name | data_marketing$olist_clos… |
| Number of rows | 842 |
| Number of columns | 14 |
| _______________________ | |
| Column type frequency: | |
| factor | 12 |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| mql_id | 0 | 1 | FALSE | 842 | 000: 1, 009: 1, 00d: 1, 010: 1 |
| seller_id | 0 | 1 | FALSE | 842 | 000: 1, 012: 1, 013: 1, 01f: 1 |
| sdr_id | 0 | 1 | FALSE | 32 | 4b3: 140, 068: 81, 56b: 74, 9d1: 66 |
| sr_id | 0 | 1 | FALSE | 22 | 4ef: 133, d3d: 82, 656: 74, 85f: 64 |
| won_date | 0 | 1 | FALSE | 824 | 201: 6, 201: 4, 201: 3, 201: 3 |
| business_segment | 0 | 1 | FALSE | 34 | hom: 105, hea: 93, car: 77, hou: 71 |
| lead_type | 0 | 1 | FALSE | 9 | onl: 332, onl: 126, ind: 123, off: 104 |
| lead_behaviour_profile | 0 | 1 | FALSE | 10 | cat: 407, emp: 177, eag: 123, wol: 95 |
| has_company | 0 | 1 | FALSE | 3 | emp: 779, Tru: 58, Fal: 5 |
| has_gtin | 0 | 1 | FALSE | 3 | emp: 778, Tru: 54, Fal: 10 |
| average_stock | 0 | 1 | FALSE | 7 | emp: 776, 5-2: 22, 50-: 15, 1-5: 10 |
| business_type | 0 | 1 | FALSE | 4 | res: 587, man: 242, emp: 10, oth: 3 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| declared_product_catalog_size | 773 | 0.08 | 233.03 | 352.38 | 1 | 30 | 100 | 300 | 2e+03 | ▇▁▁▁▁ |
| declared_monthly_revenue | 0 | 1.00 | 73377.68 | 1744799.18 | 0 | 0 | 0 | 0 | 5e+07 | ▇▁▁▁▁ |
head(data_marketing$olist_closed_deals_dataset) %>% datatable(., rownames = FALSE, options = list(scrollX = TRUE))
skim(data_marketing$olist_marketing_qualified_leads_dataset)
| Name | data_marketing$olist_mark… |
| Number of rows | 8000 |
| Number of columns | 4 |
| _______________________ | |
| Column type frequency: | |
| factor | 4 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| mql_id | 0 | 1 | FALSE | 8000 | 000: 1, 000: 1, 001: 1, 002: 1 |
| first_contact_date | 0 | 1 | FALSE | 336 | 201: 93, 201: 76, 201: 75, 201: 71 |
| landing_page_id | 0 | 1 | FALSE | 495 | b76: 912, 22c: 883, 583: 495, 887: 445 |
| origin | 0 | 1 | FALSE | 11 | org: 2296, pai: 1586, soc: 1350, unk: 1099 |
head(data_marketing$olist_marketing_qualified_leads_dataset) %>% datatable(., rownames = FALSE, options = list(scrollX = TRUE))
# merge dataframes
df_deals <- data_marketing$olist_closed_deals_dataset %>%
right_join(data_marketing$olist_marketing_qualified_leads_dataset, by = "mql_id")
# createa monthly dataframe for deals by origin by first contact date
df_deals_monthly <- df_deals %>%
mutate(first_contact_month = format.Date(as_date(first_contact_date), "%Y%m"),
deal_closed = if_else(is.na(seller_id), 0, 1),
origin = if_else(origin == "", "unknown", as.character(origin))) %>%
group_by(origin, first_contact_month) %>%
summarize(leads = n(),
closed_deals = sum(deal_closed)) %>%
ungroup()
plot_leads <- df_deals_monthly %>%
group_by(first_contact_month) %>%
summarize(leads = sum(leads)) %>%
ggplot(., aes(x = first_contact_month, y = leads, group = 1)) +
geom_line() +
geom_point() +
theme_bw() +
labs(x = "", y = "Number of Leads", colour = "origin")
ggplotly(plot_leads)
#plot_leads
plot_leads_origin <- df_deals_monthly %>%
select(-closed_deals) %>%
pivot_wider(., names_from = origin, values_from = leads) %>%
replace(is.na(.), 0) %>%
ggplot(., aes(x = first_contact_month, group = 1)) +
geom_line(aes(y = direct_traffic, color = "direct_traffic")) +
geom_line(aes(y = display, color = "display"))+
geom_line(aes(y = email, color = "email")) +
geom_line(aes(y = organic_search, color = "organic_search")) +
geom_line(aes(y = paid_search, color = "paid_search")) +
geom_line(aes(y = referral, color = "referral")) +
geom_line(aes(y = social, color = "social")) +
geom_line(aes(y = other_publicities, color = "other_publicities")) +
geom_line(aes(y = other, color = "other")) +
geom_line(aes(y = unknown, color = "unknown")) +
geom_point(aes(y = direct_traffic, color = "direct_traffic")) +
geom_point(aes(y = display, color = "display"))+
geom_point(aes(y = email, color = "email")) +
geom_point(aes(y = organic_search, color = "organic_search")) +
geom_point(aes(y = paid_search, color = "paid_search")) +
geom_point(aes(y = referral, color = "referral")) +
geom_point(aes(y = social, color = "social")) +
geom_point(aes(y = other_publicities, color = "other_publicities")) +
geom_point(aes(y = other, color = "other")) +
geom_point(aes(y = unknown, color = "unknown")) +
theme_bw() +
labs(x = "", y = "Number of Leads", colour = "origin")
ggplotly(plot_leads_origin)
#plot_leads_origin
plot_bar_leads_origin <- df_deals_monthly %>%
select(-closed_deals) %>%
group_by(origin) %>%
summarize(leads = sum(leads)) %>%
ggplot(., aes(x = reorder(origin, -leads), y = leads, group = 1)) +
geom_col(aes(fill = origin))+
geom_text(aes(label = ..y..), vjust = -1) +
theme_bw() +
labs(x = "", y = "Number of Leads", colour = "origin") +
scale_y_continuous(limits = c(0, 2350))
ggplotly(plot_bar_leads_origin)
#plot_bar_leads_origin
plot_bar_leads_origin_per <- df_deals_monthly %>%
select(-closed_deals) %>%
group_by(origin) %>%
summarize(leads = sum(leads)) %>%
ggplot(., aes(x = reorder(origin, -leads), y = leads / sum(leads), group = 1)) +
geom_col(aes(fill = origin))+
geom_text(aes(label = paste0(round(..y.. * 100,1), "%")), vjust = -1) +
theme_bw() +
labs(x = "", y = "Number of Leads", colour = "origin") +
scale_y_continuous(labels=scales::percent, limits = c(0, 0.3))
ggplotly(plot_bar_leads_origin_per)
#plot_bar_leads_origin_per
plot_leads <- df_deals_monthly %>%
group_by(first_contact_month) %>%
summarize(leads = sum(closed_deals)) %>%
ggplot(., aes(x = first_contact_month, y = leads, group = 1)) +
geom_line() +
geom_point() +
theme_bw() +
labs(x = "", y = "Number of Closed Deals", colour = "origin")
ggplotly(plot_leads)
# plot_leads
plot_leads_origin <- df_deals_monthly %>%
select(-leads) %>%
pivot_wider(., names_from = origin, values_from = closed_deals) %>%
replace(is.na(.), 0) %>%
ggplot(., aes(x = first_contact_month, group = 1)) +
geom_line(aes(y = direct_traffic, color = "direct_traffic")) +
geom_line(aes(y = display, color = "display"))+
geom_line(aes(y = email, color = "email")) +
geom_line(aes(y = organic_search, color = "organic_search")) +
geom_line(aes(y = paid_search, color = "paid_search")) +
geom_line(aes(y = referral, color = "referral")) +
geom_line(aes(y = social, color = "social")) +
geom_line(aes(y = other_publicities, color = "other_publicities")) +
geom_line(aes(y = other, color = "other")) +
geom_line(aes(y = unknown, color = "unknown")) +
geom_point(aes(y = direct_traffic, color = "direct_traffic")) +
geom_point(aes(y = display, color = "display"))+
geom_point(aes(y = email, color = "email")) +
geom_point(aes(y = organic_search, color = "organic_search")) +
geom_point(aes(y = paid_search, color = "paid_search")) +
geom_point(aes(y = referral, color = "referral")) +
geom_point(aes(y = social, color = "social")) +
geom_point(aes(y = other_publicities, color = "other_publicities")) +
geom_point(aes(y = other, color = "other")) +
geom_point(aes(y = unknown, color = "unknown")) +
theme_bw() +
labs(x = "", y = "Number of Closed Deals", colour = "origin")
ggplotly(plot_leads_origin)
# plot_leads_origin
plot_bar_leads_origin <- df_deals_monthly %>%
select(-leads) %>%
group_by(origin) %>%
summarize(leads = sum(closed_deals)) %>%
ggplot(., aes(x = reorder(origin, -leads), y = leads, group = 1)) +
geom_col(aes(fill = origin))+
geom_text(aes(label = ..y..), vjust = -1) +
theme_bw() +
labs(x = "", y = "Number of Closed Deals", colour = "origin") +
scale_y_continuous(limits = c(0, 300))
ggplotly(plot_bar_leads_origin)
# plot_bar_leads_origin
plot_bar_leads_origin_per <- df_deals_monthly %>%
select(-leads) %>%
group_by(origin) %>%
summarize(closed_deals = sum(closed_deals)) %>%
ggplot(., aes(x = reorder(origin, -closed_deals), y = closed_deals / sum(closed_deals), group = 1)) +
geom_col(aes(fill = origin))+
geom_text(aes(label = paste0(round(..y.. * 100,1), "%")), vjust = -1) +
theme_bw() +
labs(x = "", y = "Number of Closed Deals", colour = "origin") +
scale_y_continuous(labels=scales::percent, limits = c(0, 0.35))
ggplotly(plot_bar_leads_origin_per)
# plot_bar_leads_origin_per
plot_conversion <- df_deals_monthly %>%
group_by(first_contact_month) %>%
summarize(leads = sum(leads),
closed_deals = sum(closed_deals)) %>%
mutate(conversion_rate = closed_deals / leads) %>%
ggplot(., aes(x = first_contact_month, y = conversion_rate, group = 1)) +
geom_line() +
geom_point() +
theme_bw() +
labs(x = "", y = "Conversion Rate", colour = "origin") +
scale_y_continuous(labels=scales::percent)
ggplotly(plot_conversion)
# plot_conversion
plot_bar_conversion_origin_2017 <- df_deals_monthly %>%
filter(substr(first_contact_month,1,4) == "2017") %>%
group_by(origin) %>%
summarize(leads = sum(leads),
closed_deals = sum(closed_deals)) %>%
mutate(conversion_rate = closed_deals / leads) %>%
ggplot(., aes(x = reorder(origin, -conversion_rate), y = conversion_rate, group = 1)) +
geom_col(aes(fill = origin))+
geom_text(aes(label = paste0(round(..y.. * 100,1), "%")), vjust = -1) +
theme_bw() +
labs(x = "", y = "Conversion Rate", colour = "origin") +
scale_y_continuous(labels=scales::percent, limits = c(0, 0.20))
ggplotly(plot_bar_conversion_origin_2017)
# plot_bar_conversion_origin_2017
plot_bar_conversion_origin_2018 <- df_deals_monthly %>%
filter(substr(first_contact_month,1,4) == "2018") %>%
group_by(origin) %>%
summarize(leads = sum(leads),
closed_deals = sum(closed_deals)) %>%
mutate(conversion_rate = closed_deals / leads) %>%
ggplot(., aes(x = reorder(origin, -conversion_rate), y = conversion_rate, group = 1)) +
geom_col(aes(fill = origin))+
geom_text(aes(label = paste0(round(..y.. * 100,1), "%")), vjust = -1) +
theme_bw() +
labs(x = "", y = "Conversion Rate", colour = "origin") +
scale_y_continuous(labels=scales::percent, limits = c(0, 0.30))
ggplotly(plot_bar_conversion_origin_2018)
# plot_bar_conversion_origin_2018